import pandas as pd
import numpy as np

# Simulated e-mail body templates. Each entry is a str.format() template;
# the {placeholders} are filled in by generate_spam() / generate_normal().
#
# Spam templates. Placeholders used: {item}, {link}, {amount}, {phone},
# {price}, {discount}, {salary}.
spam_templates = [
    "获取免费{item}！点击{link}立即领取！",
    "恭喜您中奖{amount}元！详情咨询{phone}",
    "特价促销！原价{price}现在只要{discount}！",
    "您的账户已被冻结，请立即验证：{link}",
    "工作机会！月薪{salary}，联系{phone}"
]

# Normal (non-spam) templates. Placeholders used: {topic}, {day},
# {document}, {name}, {time}.
normal_templates = [
    "你好，关于{topic}的项目进展如何？",
    "下周{day}的会议请准时参加",
    "请查收附件中的{document}文档",
    "{name}，之前讨论的问题已解决",
    "明天{time}记得参加团队会议"
]

# Helper functions that each build one random message body
def generate_spam():
    """Return one random spam message built from ``spam_templates``.

    A template is picked with ``np.random.choice`` and filled in with
    ``str.format``. Every placeholder value is generated on each call,
    whether or not the chosen template uses it; ``str.format`` ignores the
    keyword arguments a template does not reference.

    Returns:
        The formatted message text.
    """
    template = np.random.choice(spam_templates)
    return template.format(
        item=np.random.choice(['iPhone', '购物券', '会员卡']),
        # Fake domain: "www." + 8 random lowercase letters + ".com".
        link='www.' + ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 8)) + '.com',
        amount=np.random.randint(1000, 100000),
        # 11 random decimal digits. randint's upper bound is exclusive, so
        # it must be 10 (not 9) for the digit 9 to be possible.
        phone=''.join(str(digit) for digit in np.random.randint(0, 10, size=11)),
        price=np.random.randint(1000, 10000),
        discount=np.random.randint(100, 1000),
        salary=f"{np.random.randint(10, 50)}K"
    )

def generate_normal():
    """Return one random non-spam message built from ``normal_templates``.

    A template is picked with ``np.random.choice``; all five placeholder
    values are then drawn (topic, day, document, name, time, in that order)
    and applied with ``str.format``, which ignores the ones the chosen
    template does not reference.

    Returns:
        The formatted message text.
    """
    chosen = np.random.choice(normal_templates)

    # Draw the values in a fixed order so the random stream is consumed
    # the same way on every call.
    topic = np.random.choice(['产品开发', '市场营销', '客户服务'])
    weekday = np.random.choice(['周一', '周二', '周三', '周四', '周五'])
    doc_kind = np.random.choice(['报告', '方案', '预算', '计划'])
    person = np.random.choice(['张经理', '李经理', '王经理'])
    # Whole hour from 9 through 16 (randint excludes the upper bound).
    hour = np.random.randint(9, 17)

    fields = {
        'topic': topic,
        'day': weekday,
        'document': doc_kind,
        'name': person,
        'time': f"{hour}:00",
    }
    return chosen.format(**fields)

# Build a 1000-row dataset. The fixed seed makes every run reproducible.
np.random.seed(42)
n_samples = 1000
n_spam = int(n_samples * 0.3)  # 30% of the rows are spam
n_normal = n_samples - n_spam

# Generate the spam messages first, then the normal ones.
spam_texts = [generate_spam() for _ in range(n_spam)]
normal_texts = [generate_normal() for _ in range(n_normal)]

# Concatenate texts and build the matching labels (1 = spam, 0 = normal).
texts = spam_texts + normal_texts
labels = [1] * n_spam + [0] * n_normal

# Shuffle texts and labels with the same permutation so pairs stay aligned.
indices = np.random.permutation(n_samples)
texts = [texts[i] for i in indices]
labels = [labels[i] for i in indices]

# Assemble the DataFrame.
df = pd.DataFrame({
    'text': texts,
    'spam': labels
})

# Blank out the label on a random subset of rows (each row with probability
# 0.05) to simulate missing values.
mask = np.random.random(n_samples) < 0.05
# NaN cannot be stored in an int64 column. Relying on pandas to upcast
# silently during assignment is deprecated (FutureWarning in pandas >= 2.1,
# slated to become an error), so cast to float explicitly first. The
# resulting column and CSV output are the same as the implicit upcast gave.
df['spam'] = df['spam'].astype('float64')
df.loc[mask, 'spam'] = np.nan

# Write the CSV file and print a short summary.
df.to_csv('spam.csv', index=False)
print("已创建spam.csv文件，共计{}行数据".format(len(df)))
print("\n数据预览:")
print(df.head())
print("\n标签分布:")
print(df['spam'].value_counts(dropna=False))